# Purpose of the code:
# Plot all feature values across days of study & MSFC scores on a same graph
# plot the regression lines that can best fit to feature values and MSFC scores in visits 2 & 3
# necessary imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math as math
from scipy.stats import pearsonr, betai
%matplotlib inline
# switch to a proper directiry to access the data
pwd
cd /camdatalake/bronze/verily_ms/device/
pwd
# check the content of the directory
ls
# download and read the data
# FeatureDay: Average value of the features for each day of study. Days are listed as
# DayOfStudy
# FeatureStudy: Features for the entire study period.For the at home features,
# the reported value is the median of the observed day level values.
import gzip, csv
with gzip.open("FeaturesDay.csv.gz", "rt", newline="") as file:
FeatureDay = pd.read_csv(file, header = 0)
with gzip.open("FeaturesStudy.csv.gz", "rt", newline="") as file:
FeatureStudy = pd.read_csv(file, header = 0)
# explore the dataset
FeatureDay.info()
FeatureDay.describe()
FeatureDay.head()
# extract names of features in the dataset
list(FeatureDay.columns)
# found list of unique IDs for patients
patient_IDs = list(FeatureDay['gls_subject_code'].unique())
patient_IDs
# 10 free living feature with high correlation
free_living_features_highly_correlated = ['idle_minutes',
'turn_vel_std_ankle',
'swing',
'stance',
'duration_movement_count',
'turn_vel_max_ankle',
'turn_duration_ankle',
'duration_rem_count',
'rem_percent',
'movement_rate']
# 19 highly correlated at home features (structured activity)
at_home_features_highly_correlated = ['mean_pvt_delay_7_at_home',
'mobility_stance_at_home',
'mean_pvt_delay_at_home',
'pq_nondominant_rhythm_at_home',
'pq_nondominant_median_at_home',
'pq_dominant_rhythm_at_home',
'turn_vel_max_at_home',
'mobility_swing_at_home',
'zx_dominant_num_correct_at_home',
'turn_vel_std_at_home',
'turn_duration_ankle_at_home',
'turn_vel_max_ankle_at_home',
'mean_pvt_delay_5_at_home',
'zx_nondominant_median_at_home',
'zx_nondominant_num_correct_at_home',
'mean_pvt_delay_3_at_home',
'turn_vel_std_ankle_at_home',
'mobility_activity_at_home_time',
'mean_pvt_delay_1_at_home']
# features related to MSFC scores (all components including composite scores)
FeatureStudy_columns_MSFC_col_names = ['msfc_walk_composite_1',
'msfc_9hpt_composite_1',
'msfc_sdmt_composite_1',
'msfc_snellen_composite_1',
'msfc_composite_1',
'msfc_walk_composite_residual_1',
'msfc_9hpt_composite_residual_1',
'msfc_sdmt_composite_residual_1',
'msfc_snellen_composite_residual_1',
'msfc_walk_composite_2',
'msfc_9hpt_composite_2',
'msfc_sdmt_composite_2',
'msfc_snellen_composite_2',
'msfc_composite_2',
'msfc_walk_composite_residual_2',
'msfc_9hpt_composite_residual_2',
'msfc_sdmt_composite_residual_2',
'msfc_snellen_composite_residual_2',
'msfc_walk_composite_3',
'msfc_9hpt_composite_3',
'msfc_sdmt_composite_3',
'msfc_snellen_composite_3',
'msfc_composite_3',
'msfc_walk_composite_residual_3',
'msfc_9hpt_composite_residual_3',
'msfc_sdmt_composite_residual_3',
'msfc_snellen_composite_residual_3']
# features related to MSFC scores (composite scores)
FeatureStudy_columns_MSFC_composite_col_names = ['msfc_composite_1', 'msfc_composite_2', 'msfc_composite_3']
# breaking down FeatureDay dataframe
FeatureDay_free_living = FeatureDay[free_living_features_highly_correlated]
FeatureDay_at_home = FeatureDay[at_home_features_highly_correlated]
FeaFeatureDay_MSFC_score_all = FeatureDay[FeatureStudy_columns_MSFC_col_names]
FeaFeatureDay_MSFC_score_composite = FeatureDay[FeatureStudy_columns_MSFC_composite_col_names]
# create a data frame with patients' IDs, msfc_composite_2 and mafc_composite_3
patient_ID = []
visit2_MSFC_composite = []
visit3_MSFC_composite = []
for idx in range(len(patient_IDs)):
ID = patient_IDs[idx]
df = FeatureDay[FeatureDay['gls_subject_code'] == ID][['msfc_composite_2', 'msfc_composite_3']]
patient_ID.append(ID)
visit2_MSFC_composite.append(df.iloc[0]['msfc_composite_2'])
visit3_MSFC_composite.append(df.iloc[0]['msfc_composite_3'])
all_data = []
all_data.append(patient_ID)
all_data.append(visit2_MSFC_composite)
all_data.append(visit3_MSFC_composite)
all_data = list(zip(*all_data))
df_MSFC_composite = pd.DataFrame(all_data,columns=['patient_ID','visit2_MSFC_composite','visit3_MSFC_composite'])
df_MSFC_composite.head()
def remove_outliers(feature_values, day_of_study):
# a function to remove outliers from input dataset and return filtered dataset as the ouput
m = 2 # stance threshold from the mean
mean = feature_values.mean()
std = feature_values.std()
tuples = list(zip(feature_values,day_of_study))
filtered_values = []
for (x,y) in tuples:
if (x >= mean - m * std) & (x <= mean + m * std):
filtered_values.append((x,y))
unzip_filtered_values = list(zip(*filtered_values))
# check for missing values
if len(unzip_filtered_values) > 0:
return pd.Series(list(unzip_filtered_values[0])), pd.Series(list(unzip_filtered_values[1]))
else:
return pd.Series([]),day_of_study
def standardize_axis(feature):
# a function to standardize the axis
# remove outliers (both feature values & associated days of study), return filtered values
# use the filtered values to assign a range to axis
# we assume dataframes FeatureDay and patient_IDs are already defined
All_filtered_feature_values = []
All_filtered_days_of_studies = []
# loop on all the patients
for ID in patient_IDs:
# extract part of FeatureDay that is related to a patient and input feature
col_1 = feature
col_2 = 'dayofstudy'
df = FeatureDay[FeatureDay['gls_subject_code'] == ID][[col_1,col_2]]
# sort the dataframe based on days of study
df.sort(col_2, inplace = True)
# create list of x: days of study, y: feature values
x = df[col_2]
y = df[col_1]
# remove outliers (both feature values & associated day of study)
y,x = remove_outliers(y,x)
# store all the filtered values
All_filtered_feature_values = All_filtered_feature_values + (pd.Series.tolist(y))
All_filtered_days_of_studies = All_filtered_days_of_studies + (pd.Series.tolist(x))
# set the axis ranges to the max value in the list of filtered values
max_y = (np.max(All_filtered_feature_values))
max_x = (np.max(All_filtered_days_of_studies))
# return the axis ranges
return max_y,max_x
def plot_feature_across_days_and_composite_scores(feature):
# plot the measurments for a specific feature vs. days
# plot MSFC composite scores for visits 2 and 3 on a same graph
figs, axes = plt.subplots(nrows= 5, ncols= 5,figsize=(20,20),dpi = 200)
print(feature)
# plot the measurments vs. days of study
for idx in range(len(patient_IDs)):
# extract the patient ID
ID = patient_IDs[idx]
# extract two columns as a dataframe
col_1 = feature
col_2 = 'dayofstudy'
df = FeatureDay[FeatureDay['gls_subject_code'] == ID][[col_1,col_2]]
# sort the dataframe based on days of study
df.sort(col_2, inplace = True)
x = df[col_2]
y = df[col_1]
# set the row and column numbers based on the fact that we have 25 patients
row = idx // 5
col = idx % 5
# standardize the axis
max_y,max_x = standardize_axis(feature)
axes[row,col].set_xlim(0, max_x)
# MSFC scores can be negative
# standardize the axis to consider negative scores
axes[row,col].set_ylim(-1*max_y, max_y)
axes[row,col].set_title(ID,y=0.9)
axes[row,col].set_xlabel('Days of Study')
axes[row,col].set_ylabel(feature)
# plot the measurments vs. days of study
if (len(y.unique()) == 1) & (np.isnan(y.unique()).sum() == 1):
pass
else:
y,x = remove_outliers(y,x)
if len(y) == 0:
pass
else:
sns.regplot(x,y,ax=axes[row,col],label = 'device')
axes[row,col].set_xlabel('Days of Study')
axes[row,col].set_ylabel(feature)
# plot MSFC scores for visits 2 & 3 on the same graph
col_1_new = 'visit2_MSFC_composite'
col_2_new = 'visit3_MSFC_composite'
x_new = [0,max_x]
y_new = list(df_MSFC_composite[df_MSFC_composite['patient_ID'] == ID][[col_1_new,col_2_new]].iloc[0])
# MSFC scores are very small. Scale them.
y_new = [(max_y/2) * i for i in y_new]
axes[row,col].plot(x_new,y_new,label='MSFC')
axes[row,col].legend(loc = 3)
# plot free living features over days of study
# plot the regression line that can fit to feature values
# plot a regression line for MSFC composite scores for visits 2 and 3
feature = free_living_features_highly_correlated[0]
plot_feature_across_days_and_composite_scores(feature)
feature = free_living_features_highly_correlated[1]
plot_feature_across_days_and_composite_scores(feature)
feature = free_living_features_highly_correlated[2]
plot_feature_across_days_and_composite_scores(feature)
feature = free_living_features_highly_correlated[3]
plot_feature_across_days_and_composite_scores(feature)
feature = free_living_features_highly_correlated[4]
plot_feature_across_days_and_composite_scores(feature)
feature = free_living_features_highly_correlated[5]
plot_feature_across_days_and_composite_scores(feature)
feature = free_living_features_highly_correlated[6]
plot_feature_across_days_and_composite_scores(feature)
feature = free_living_features_highly_correlated[7]
plot_feature_across_days_and_composite_scores(feature)
feature = free_living_features_highly_correlated[8]
plot_feature_across_days_and_composite_scores(feature)
feature = free_living_features_highly_correlated[9]
plot_feature_across_days_and_composite_scores(feature)
# plot at home (structured activity) features over days of study
# plot the regression line that can fit to feature values
# plot a regression line for MSFC composite scores for visits 2 and 3
feature = at_home_features_highly_correlated[0]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[1]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[2]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[3]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[4]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[5]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[6]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[7]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[8]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[9]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[10]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[11]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[12]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[13]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[14]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[15]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[16]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[17]
plot_feature_across_days_and_composite_scores(feature)
feature = at_home_features_highly_correlated[18]
plot_feature_across_days_and_composite_scores(feature)